第一節：描述性統計(1)

這節課我們終於要開始進行數據分析了，在最開始我們要簡單介紹幾個描述性統計的函數

– 請至這裡下載本週的範例資料

dat = read.csv("Example data.csv", header = TRUE)
head(dat)

##       eGFR Disease Survival.time Death Diabetes Cancer      SBP      DBP
## 1 34.65379       1     0.4771037     0        0      1 121.2353 121.3079
## 2 37.21183       1     3.0704424     0        1      1 122.2000 122.6283
## 3 32.60074       1     0.2607117     1        0      0 118.9136 121.7621
## 4 29.68481       1            NA    NA        0      0 118.2212 112.7043
## 5 28.35726       0     0.1681673     1        0      0 116.7469 115.7705
## 6 33.95012       1     1.2238556     0        0      0 119.9936 116.3872
##   Education Income
## 1         2      0
## 2         2      0
## 3         0      0
## 4         1      0
## 5         0      0
## 6         1      0

以下幾個函數為常見的描述性統計，分別是：

函數「mean()」可以幫助我們計算平均值

mean(dat$eGFR, na.rm = TRUE)

## [1] 34.14115

函數「sd()」可以幫助我們計算標準差

sd(dat$eGFR, na.rm = TRUE)

## [1] 7.062506

函數「var()」可以幫助我們計算變異數

var(dat$eGFR, na.rm = TRUE)

## [1] 49.87899

函數「median()」可以幫助我們計算變異數

median(dat$eGFR, na.rm = TRUE)

## [1] 34.33123

函數「quantile()」可以幫助我們計算百分位數

quantile(dat$eGFR, na.rm = TRUE)

##       0%      25%      50%      75%     100% 
## 10.00000 29.58404 34.33123 38.44965 60.00000

quantile(dat$eGFR, 0.5, na.rm = TRUE)

##      50% 
## 34.33123

quantile(dat$eGFR, 0.95, na.rm = TRUE)

##      95% 
## 45.86468

函數「min()」可以幫助我們找出最小值

min(dat$eGFR, na.rm = TRUE)

## [1] 10

函數「max()」可以幫助我們找出最大值

max(dat$eGFR, na.rm = TRUE)

## [1] 60

函數「table()」可以幫助我們產生列聯表

table(dat$Cancer)

## 
##   0   1 
## 576 402

table(dat$Cancer, dat$Diabetes)

##    
##       0   1
##   0 468  97
##   1 309  81

函數「prop.table()」可以幫助我們產生列聯表的百分比

tab1 = table(dat$Cancer)
prop.table(tab1)

## 
##         0         1 
## 0.5889571 0.4110429

tab2 = table(dat$Cancer, dat$Diabetes)
prop.table(tab2)

##    
##              0          1
##   0 0.49005236 0.10157068
##   1 0.32356021 0.08481675

prop.table(tab2, 1)

##    
##             0         1
##   0 0.8283186 0.1716814
##   1 0.7923077 0.2076923

prop.table(tab2, 2)

##    
##             0         1
##   0 0.6023166 0.5449438
##   1 0.3976834 0.4550562

第一節：描述性統計(2)

利用索引函數，我們將能得到某種條件下的描述性統計，舉例來說我們想獲得case組的eGFR平均數，可以使用下列語法

mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)

## [1] 34.41452

Control組的亦同

mean(dat[dat[,"Disease"] == 0,]$eGFR, na.rm = TRUE)

## [1] 33.19008

如果我們想要把平均數±標準差表示出來，可以利用函數「paste()」或函數「paste0()」：

paste(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), sep = "")

## [1] "34.4145222034604±7.1582590897065"

paste0(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE))

## [1] "34.4145222034604±7.1582590897065"

我們發現這樣的呈現相當醜，我們可以使用函數「formatC()」來指定我們想要的小數點位數

m = mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
s = sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
formatC(m, digits = 3, format = "f")

## [1] "34.415"

formatC(s, digits = 3, format = "f")

## [1] "7.158"

paste0(formatC(m, digits = 3, format = "f"), "±", formatC(s, digits = 3, format = "f"))

## [1] "34.415±7.158"

練習1：方便的自訂函數

現在請利用自訂函數功能，寫出兩個函數，規格如下：

函數之input必須為一類別變項及一連續變項
函數中必須設定一個變項，讓使用者能決定顯示小數點的位數

3-1. 第一個函數必須輸出在該類別變項為不同類別時，該連續變項之『平均數±標準差』

3-2. 第二個函數必須輸出在該類別變項為不同類別時，該連續變項之『中位數(25百分位-75百分位)』

– 注意：請考慮類別數不只是2的情形，以及類別項目並非為0、1、2、…的狀況

練習1答案

這題的關鍵是要考慮到類別變項的數目，再把他包成函數：

– 剩下的其實就是把前面的東西貼起來：

my_func.1 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  result = character(n.lvl.x)
  
  for (i in 1:n.lvl.x) {
    m = mean(y[x == lvl.x[i]], na.rm = TRUE)
    s = sd(y[x == lvl.x[i]], na.rm = TRUE)
    result[i] = paste0(formatC(m, digits = dig, format = "f"), "±", formatC(s, digits = dig, format = "f"))
  }
  
  result
  
}

my_func.2 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  result = character(n.lvl.x)
  
  for (i in 1:n.lvl.x) {
    m = median(y[x == lvl.x[i]], na.rm = TRUE)
    q1 = quantile(y[x == lvl.x[i]], 0.25, na.rm = TRUE)
    q2 = quantile(y[x == lvl.x[i]], 0.75, na.rm = TRUE)
    result[i] = paste0(formatC(m, digits = dig, format = "f"), "(", formatC(q1, digits = dig, format = "f"), "-", formatC(q2, digits = dig, format = "f"), ")")
  }
  
  result
  
}

讓我們實際使用一下：

my_func.1(dat$Education, dat$SBP)

## [1] "121.365±10.568" "121.406±10.920" "122.853±10.532"

my_func.2(dat$Income, dat$eGFR, dig = 1)

## [1] "34.3(29.5-38.2)" "33.8(28.9-38.8)" "34.9(30.4-40.0)"

第二節：列表(List)層物件基本介紹(1)

在之後的課程開始之前，我們要先介紹一下R裡面的列表(List)層物件(物件簡介在Lesson 1的第7頁)

– 列表(List)層分為列表(list)、S3物件(S3 class)及S4物件(S4 class)：

列表(list)：在R裡面，向量的上層是陣列層物件。若是我們希望在一個物件內放置很多陣列層物件，我們會用到列表。値得一提的是，列表裡面可以同時包含數個陣列層物件及變數層物件。
S3物件(S3 class)：S3物件是一種特殊的列表物件，他的變化會在後面慢慢介紹。
S4物件(S4 class)：S4物件與前面兩種有非常大的不同，相關的函數也不一樣，在本節課我們不會教到。

我們首先介紹列表(list)物件

# 先產生一個數値矩陣物件
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
M1

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20

# 再產生一個文字矩陣物件
x2 = c("A", "B", "C", "A", "C", "B", "B", "B", "A")
M2 = matrix(x2, nrow = 3, ncol = 3)
M2

##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"

# 再產生一個邏輯向量
x3 = c(TRUE, FALSE, TRUE, FALSE)
x3

## [1]  TRUE FALSE  TRUE FALSE

# 將上述這些物件打包成一個列表物件
L1 = list(M1, M2, x3)
L1

## [[1]]
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## [[2]]
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## [[3]]
## [1]  TRUE FALSE  TRUE FALSE

第二節：列表(List)層物件基本介紹(2)

列表(List)層物件中有幾個重要的函數來描述它的基本資料：

函數「length()」可以協助我們了解物件長度
函數「class()」可以查詢該物件的屬性
函數「names()」可以協助我們命名物件
函數「ls()」可以協助我們看看物件中有哪些東西

length(L1)

## [1] 3

class(L1)

## [1] "list"

names(L1) = c("A", "B", "C")
L1

## $A
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## $B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## $C
## [1]  TRUE FALSE  TRUE FALSE

ls(L1)

## [1] "A" "B" "C"

第二節：列表(List)層物件基本介紹(3)

在R裡面所有的索引函數都由中括號形成，列表(List)層物件中的索引函數是使用[[]]，我們直接在中括號內填入數字就好。另外，索引函數「$」一樣能用在陣列層的物件

L1[[2]]

##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"

L1[["B"]]

##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"

L1$B

##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A"

如果我們想要進一步再找藏在L1裡面的B裡面的元素，我們可以繼續使用索引函數

L1[[2]][2,3]

## [1] "B"

L1[["B"]][3,1]

## [1] "C"

L1$B[1,2]

## [1] "A"

第二節：列表(List)層物件基本介紹(4)

經過了上述的示範後，我們了解到列表(list)是一個很方便的物件，它可以把很多很雜的東西丟在同個物件內。但東西多了以後會遇到問題，那就是該列表物件會變的非常非常大，但也許我們想要呈現的東西是很有限的，在R裡面，列表有一種擴展型態叫做S3物件(S3 class)，它可以解決這個問題。
S3物件(S3 class)的產生方式如下

#先看看L1的樣子
L1

## $A
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## $B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## $C
## [1]  TRUE FALSE  TRUE FALSE

#先看看L1的物件屬性
class(L1)

## [1] "list"

#強迫L1成為別的物件屬性
class(L1) = "test"
#再看看L1的物件屬性
class(L1)

## [1] "test"

#看看L1現在的樣子
L1

## $A
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20
## 
## $B
##      [,1] [,2] [,3]
## [1,] "A"  "A"  "B" 
## [2,] "B"  "C"  "B" 
## [3,] "C"  "B"  "A" 
## 
## $C
## [1]  TRUE FALSE  TRUE FALSE
## 
## attr(,"class")
## [1] "test"

– 小提示：當你使用函數「class()」可以查詢該物件的屬性，若非常見的幾種屬性名稱，那就非常有可能是S3物件(S3 class)或S4物件(S4 class)

第二節：列表(List)層物件基本介紹(5)

轉變為S3物件(S3 class)後，我們可以透過自訂函數「print.XXX()」、自訂函數「summary.XXX()」等函數來讓R知道要如何呈現這個S3物件

#先寫一個自訂函數「print.test()」
print.test = function(test) {
  cat("此列表共有",length(test),"個物件\n")
  cat("物件名稱分別為：\n")
  cat(paste(names(test), collapse = ", "), "\n")
}

#再看看請R列印出L1會變什麼
L1

## 此列表共有 3 個物件
## 物件名稱分別為：
## A, B, C

– 列表(list)的幾個常見函數還是能夠使用：

ls(L1)

## [1] "A" "B" "C"

length(L1)

## [1] 3

class(L1)

## [1] "test"

names(L1) = c("D", "E", "F")
L1

## 此列表共有 3 個物件
## 物件名稱分別為：
## D, E, F

第二節：列表(List)層物件基本介紹(6)

接著我們還能寫自訂函數「summary.XXX()」來呈現更詳細的資料

– 在寫之前我們先看看直接對L1使用函數「summary()」會怎樣

summary(L1)

##   Length Class  Mode     
## D 20     -none- numeric  
## E  9     -none- character
## F  4     -none- logical

– 現在我們可以讓函數「summary()」使用後產生不同的結果

#先寫一個自訂函數「summary.test()」
summary.test = function(test) {
  cat("此列表共有",length(test),"個物件\n")
  cat("物件名稱分別為：\n")
  cat(paste(names(test), collapse = ", "), "\n")
  for (i in 1:length(test)) {
    cat(names(test)[i], "之物件屬性為", class(test[[i]]), "\n")
  }
}

#再看看使用函數「summary()」後會變什麼
summary(L1)

## 此列表共有 3 個物件
## 物件名稱分別為：
## D, E, F 
## D 之物件屬性為 matrix 
## E 之物件屬性為 matrix 
## F 之物件屬性為 logical

練習2：S3物件練習

我們已經學會如何將想要的資訊放在列表(list)物件中，並透過將這個物件轉換為一個特定的S3物件(S3 class) 後，就可以透過自訂函數「print.XXX()」呈現想要的結果。
現在請延續練習-1的功課，我們要額外增加一些要求：

平均數必須儲存在列表(list)物件的第一個位置，並且必須儲存為一『數値變數』，長度不限
標準差必須儲存在列表(list)物件的第二個位置，並且必須儲存為一『數値變數』，長度不限
各組別樣本數必須儲存在列表(list)物件的第三個位置，並且必須儲存為一『數値變數』，長度不限
列表(list)物件的第四個位置必須放置『平均數±標準差』，並且必須儲存為一『文字變數』，長度不限
列表(list)物件的第五個位置必須放置當初輸入的類別變項之『各類別名稱』，並且必須儲存為一『文字變數』，長度不限
在使用函數「print()」時，我想看到連續幾行的『組別：‘XXX’之平均數±標準差為’XXX±XXX’(個案數=XXX)』。

練習2答案

這題的難度很高，首先是我們要先會使用list這個物件放東西，最後是我們需要定義一個S3 class的函數：

my_func.S3 = function (x, y, dig = 3) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  my_list = list()
  my_list[[1]] = numeric(n.lvl.x)
  my_list[[2]] = numeric(n.lvl.x)
  my_list[[3]] = numeric(n.lvl.x)
  my_list[[4]] = character(n.lvl.x)
  my_list[[5]] = character(n.lvl.x)
  
  names(my_list) = c("mean", "sd", "n", "description", "name")
  
  for (i in 1:n.lvl.x) {
    current_vec = y[x == lvl.x[i]]
    my_list[[1]][i] = mean(current_vec, na.rm = TRUE)
    my_list[[2]][i] = sd(current_vec, na.rm = TRUE)
    my_list[[3]][i] = sum(table(current_vec))
    my_list[[4]][i] = paste0(formatC(my_list[[1]][i], digits = 3, format = "f"), "±", formatC(my_list[[2]][i], digits = 3, format = "f"))
    my_list[[5]][i] = lvl.x[i]
  }
  
  class(my_list) = 'myS3'
  my_list
  
}

print.myS3 = function (x) {
  
  n.lvl.x = length(x[["name"]])
  
  for (i in 1:n.lvl.x) {
    cat(paste0("組別：", x[["name"]][i], "之平均數±標準差為", x[["description"]][i], "(個案數=", x[["n"]][i], ")\n"))
  }
  
}

讓我們實際使用一下：

my_list = my_func.S3(dat$Education, dat$SBP)

ls(my_list)

## [1] "description" "mean"        "n"           "name"        "sd"

my_list

## 組別：0之平均數±標準差為121.365±10.568(個案數=488)
## 組別：1之平均數±標準差為121.406±10.920(個案數=284)
## 組別：2之平均數±標準差為122.853±10.532(個案數=184)

第三節：t test and Wilcoxon test(1)

在自變項為2類別，依變相為連續變項時，我們最常使用t test及Wilcoxon test來進行檢定

– 這時候我們需要使用函數「t.test()」及「wilcox.test()」

我們使用糖尿病有無做分組，檢定他們eGFR的差異，分別使用t test及Wilcoxon test

result1 = t.test(dat[,"eGFR"]~dat[,"Diabetes"], var.equal = FALSE)
result1

## 
##  Welch Two Sample t-test
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## t = -12.506, df = 254.94, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.967643 -5.799633
## sample estimates:
## mean in group 0 mean in group 1 
##        32.87223        39.75587

result2 = wilcox.test(dat[,"eGFR"]~dat[,"Diabetes"])
result2

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## W = 29815, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

第三節：t test and Wilcoxon test(2)

知道為什麼直接列印出result1及result2會出現這麼多文字吧?因為函數「t.test()」及「wilcox.test()」輸出的結果都是S3物件

class(result1)

## [1] "htest"

class(result2)

## [1] "htest"

當然，如果我們可以使用函數「ls()」看看這兩個物件裡面有多少東西

ls(result1)

## [1] "alternative" "conf.int"    "data.name"   "estimate"    "method"     
## [6] "null.value"  "parameter"   "p.value"     "statistic"

ls(result2)

## [1] "alternative" "data.name"   "method"      "null.value"  "parameter"  
## [6] "p.value"     "statistic"

如果我們最需要的是裡面的p value，各位知道要怎麼取得吧?

第三節：t test and Wilcoxon test(3)

t test有分成兩種：『變異數同質』及『變異數不同質』。一般來說我們會先做變異數同值檢定，我們可以透過函數「var.test()」來做這個檢定

var.result = var.test(dat[,"eGFR"]~dat[,"Diabetes"])
var.result

## 
##  F test to compare two variances
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## F = 1.0183, num df = 767, denom df = 171, p-value = 0.8997
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.7979204 1.2767309
## sample estimates:
## ratio of variances 
##           1.018323

看起來變異數是同質的，因此其實我們應該使用的是變異數同質的t test：

result3 = t.test(dat[,"eGFR"]~dat[,"Diabetes"], var.equal = TRUE)
result3

## 
##  Two Sample t-test
## 
## data:  dat[, "eGFR"] by dat[, "Diabetes"]
## t = -12.434, df = 938, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -7.970133 -5.797143
## sample estimates:
## mean in group 0 mean in group 1 
##        32.87223        39.75587

練習3：自訂分析流程

現在請寫一個自訂函數，規格如下：

函數之input必須為一『類別變項』及一『連續變項』
先做一個檢查，如果該『類別變項』之類別數不等於2，則不執行函數或產生警告
先檢查兩組個案數是否皆>25，如果是就做t test，否則就做Wilcoxon test。
如果選擇做做t test，先確定變異數是否同質，並依據變異數同值性檢定的結果，決定要使用哪一種t test。

練習3答案

這題的關鍵在於是否能夠妥善的處理每個步驟的條件判斷：

my_func.analysis = function (x, y) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  if (n.lvl.x != 2) {
    
    cat("『類別變項』之類別數不等於2\n")
    
  } else {
    
    n.each = numeric(n.lvl.x)
    
    for (i in 1:n.lvl.x) {
      
      current_vec = y[x == lvl.x[i]]
      n.each[i] = sum(table(current_vec))
      
    }
    
    if (FALSE %in% (n.each > 25)) {
      
      result = wilcox.test(y~x)
      
    } else {
      
      var.result = var.test(y~x)
      
      if (var.result[['p.value']] < 0.05) {
        
        result = t.test(y~x, var.equal = FALSE)
        
      } else {
        
        result = t.test(y~x, var.equal = TRUE)
        
      }
      
    }
    
    result
  
  }
  
}

讓我們實際使用一下：

my_func.analysis(dat$Income, dat$eGFR)

## 『類別變項』之類別數不等於2

my_func.analysis(dat$Disease, dat$eGFR)

## 
##  Welch Two Sample t-test
## 
## data:  y by x
## t = -2.4556, df = 466.06, p-value = 0.01443
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.2043011 -0.2445876
## sample estimates:
## mean in group 0 mean in group 1 
##        33.19008        34.41452

my_func.analysis(dat[dat$Income == 2,]$Disease, dat[dat$Income == 2,]$eGFR)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  y by x
## W = 691, p-value = 0.2357
## alternative hypothesis: true location shift is not equal to 0

第四節：ANOVA and Kruskal-Wallis test(1)

若自變項為3類別以上時，依變相為連續變項時，我們通常會使用ANOVA and Kruskal-Wallis test來進行檢定

我們需要使用函數「aov()」及「anova()」執行ANOVA
函數「kruskal.test()」可以幫助我們做Kruskal-Wallis test

我們使用教育程度有無做分組，檢定他們eGFR的差異，分別使用ANOVA and Kruskal-Wallis test

Variance.table = aov(dat[,"eGFR"]~as.factor(dat[,"Education"]))
Variance.table

## Call:
##    aov(formula = dat[, "eGFR"] ~ as.factor(dat[, "Education"]))
## 
## Terms:
##                 as.factor(dat[, "Education"]) Residuals
## Sum of Squares                         213.93  47107.75
## Deg. of Freedom                             2       946
## 
## Residual standard error: 7.056683
## Estimated effects may be unbalanced
## 51 observations deleted due to missingness

ANOVA.table = anova(Variance.table)
ANOVA.table

## Analysis of Variance Table
## 
## Response: dat[, "eGFR"]
##                                Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(dat[, "Education"])   2    214 106.966   2.148 0.1173
## Residuals                     946  47108  49.797

KW.result = kruskal.test(dat[,"eGFR"]~dat[,"Education"])
KW.result

## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat[, "eGFR"] by dat[, "Education"]
## Kruskal-Wallis chi-squared = 4.519, df = 2, p-value = 0.1044

聰明的你一定想的到，如果我們最關心的是p value，要怎樣拿到p value呢?

練習4：加強分析流程

請延續練習-3的題目，現在若類別變項為2類別時就使用練習-3的函數，但當類別數>=3時，請使用下流程進行分析：

先檢查類別數是否>=3，若是則使用ANOVA或Kruskal-Wallis test，否則則使用t test或Wilcoxon test。
檢查各組內個案數是否皆>25，如果是就做ANOVA，否則就做Kruskal-Wallis test。

練習4答案

這題的答案如下：

my_func.analysis = function (x, y) {
  
  lvl.x = levels(factor(x))
  n.lvl.x = length(lvl.x)
  
  n.each = numeric(n.lvl.x)
  
  for (i in 1:n.lvl.x) {
    
    current_vec = y[x == lvl.x[i]]
    n.each[i] = sum(table(current_vec))
    
  }
  
  
  if (n.lvl.x > 2) {
    
    if (FALSE %in% (n.each > 25)) {
      
      result = kruskal.test(y~x)

    } else {
      
      Variance.table = aov(y~as.factor(x))
      result = anova(Variance.table)
      
    }
    
  } else {
    
    if (FALSE %in% (n.each > 25)) {
      
      result = wilcox.test(y~x)
      
    } else {
      
      var.result = var.test(y~x)
      
      if (var.result[['p.value']] < 0.05) {
        
        result = t.test(y~x, var.equal = FALSE)
        
      } else {
        
        result = t.test(y~x, var.equal = TRUE)
        
      }
      
    }
    
  }
  
  result
  
}

讓我們實際使用一下：

my_func.analysis(dat$Income, dat$eGFR)

## Analysis of Variance Table
## 
## Response: y
##               Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(x)   2    102  51.016  1.0115 0.3641
## Residuals    944  47613  50.438

my_func.analysis(dat[dat$Income == 2,]$Education, dat[dat$Income == 2,]$eGFR)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  y by x
## Kruskal-Wallis chi-squared = 8.7301, df = 2, p-value = 0.01271

家庭作業

本週的作業為結合練習-2的作品及練習-4的作品，當使用者給定一個類別變項及一個連續變項時，請製作一個列表儲存該儲存的數質，並將這個列表轉換為S3物件，讓使用者在print出來時可以看到類似於下列範例的東西

– 以下範例為自變項為糖尿病，依變項為eGFR。

##   Variable Group:0     Group:1     p-value 
## 1 "eGFR"   "32.9±6.6" "39.8±6.5" "<0.001"

如果你不知道如何能將p value顯示為<0.001，你可以使用這個自訂函數「.ROUND.p()」

.ROUND.p = function(X, p.digits=3) {
  p=rep(NA,length(X))
  for (a in 1:length(X)) {
    if (!is.na(X[a])) {
      if (p.digits==0) {p[a]="<1"} else {
        p[a]=as.character(round(X[a],p.digits))
        if (p[a]=="0") {
          if (p.digits==1) {p[a]="<0.1"} else {
            p[a]="<0."
            for (l in 1:(p.digits-1)) {
              p[a]=paste0(p[a],0)
            }
            p[a]=paste0(p[a],1)  
          }
        } else {
          p[a]=formatC(as.numeric(X[a]), format="f", digits = p.digits)
        }
      }
    }
  }
  return(p)
}

.ROUND.p(0.000518, 2)

## [1] "<0.01"

.ROUND.p(0.000518, 3)

## [1] "0.001"

.ROUND.p(0.000518, 4)

## [1] "0.0005"

資料分析1